Loading packages
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
library(ClassifyR)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: MultiAssayExperiment
## Loading required package: BiocParallel
Loading data
# Import the Test-match batting table, then rewrite its headers in
# lowerCamelCase (e.g. `Career Span` becomes careerSpan).
rawBattingData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Batting.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
## Player = col_character(),
## `Career Span` = col_character(),
## `Career Start` = col_integer(),
## `Career End` = col_integer(),
## `Matches Played` = col_integer(),
## `Innings Batted` = col_character(),
## `Not Outs` = col_character(),
## `Runs Scored` = col_character(),
## `Highest Innings Score` = col_character(),
## `Highest Innings Score Num` = col_character(),
## `Batting Avg` = col_character(),
## `Hundreds Scored` = col_character(),
## `Scores Of Fifty Or More` = col_character(),
## `Ducks Scored` = col_character(),
## Country = col_character(),
## `Player Count` = col_integer(),
## `10000+ Runs Scored` = col_integer(),
## `50+ Batting Avg` = col_integer()
## )
# Print a compact overview (column names, types, first values) of the raw
# batting table.
rawBattingData %>% glimpse()
## Observations: 2,918
## Variables: 18
## $ player <chr> "AN Cook (2006-2018)", "GA Gooch (1975-...
## $ careerSpan <chr> "2006-2018", "1975-1995", "1990-2003", ...
## $ careerStart <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted <chr> "282", "215", "235", "204", "181", "193...
## $ notOuts <chr> "16", "6", "21", "18", "8", "23", "7", ...
## $ runsScored <chr> "12145", "8900", "8463", "8231", "8181"...
## $ highestInningsScore <chr> "294", "333", "190", "215", "227", "246...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg <chr> "45.65", "42.58", "39.54", "44.25", "47...
## $ hundredsScored <chr> "32", "20", "15", "18", "23", "22", "16...
## $ scoresOfFiftyOrMore <chr> "56", "46", "45", "39", "35", "42", "46...
## $ ducksScored <chr> "8", "13", "14", "7", "10", "10", "20",...
## $ country <chr> "England", "England", "England", "Engla...
## $ playerCount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ x10000RunsScored <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...
# Import the Test-match bowling table, then rewrite its headers in
# lowerCamelCase (e.g. `Balls Bowled` becomes ballsBowled).
rawBowlingData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Bowling.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
## Player = col_character(),
## `Innings Bowled In` = col_character(),
## `Balls Bowled` = col_character(),
## `Runs Conceded` = col_character(),
## `Wickets Taken` = col_character(),
## `Best Bowling In An Innings` = col_character(),
## `Best Bowling In A Match` = col_character(),
## `Bowling Avg` = col_character(),
## `Economy Rate` = col_character(),
## `Bowling Strike Rate` = col_character(),
## `Five Wickets In An Innings` = col_character(),
## `Ten Wickets In A Match` = col_character(),
## `300+ Wickets Taken` = col_integer(),
## `<25.00 Bowling Avg` = col_integer()
## )
# Print a compact overview (column names, types, first values) of the raw
# bowling table.
rawBowlingData %>% glimpse()
## Observations: 2,918
## Variables: 14
## $ player <chr> "JM Anderson (2003-2018)", "SCJ Broad (...
## $ inningsBowledIn <chr> "257", "215", "168", "165", "127", "151...
## $ ballsBowled <chr> "30398", "24346", "21815", "17357", "15...
## $ runsConceded <chr> "14705", "12050", "10878", "8190", "662...
## $ wicketsTaken <chr> "540", "417", "383", "325", "307", "297...
## $ bestBowlingInAnInnings <chr> "7/42", "8/15", "8/34", "8/43", "8/31",...
## $ bestBowlingInAMatch <chr> "11/71", "11/121", "13/106", "9/92", "1...
## $ bowlingAvg <chr> "27.23", "28.89", "28.40", "25.20", "21...
## $ economyRate <chr> "2.90", "2.96", "2.99", "2.83", "2.61",...
## $ bowlingStrikeRate <chr> "56.2", "58.3", "56.9", "53.4", "49.4",...
## $ fiveWicketsInAnInnings <chr> "25", "16", "27", "16", "17", "17", "17...
## $ tenWicketsInAMatch <chr> "3", "2", "4", "0", "3", "6", "3", "1",...
## $ x300WicketsTaken <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
# Import the all-rounder table, then rewrite its headers in lowerCamelCase
# (e.g. `All-Round Ind` becomes allRoundInd).
rawAllRounderData = janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - All Round.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
## Player = col_character(),
## `Batting-Bowling Avg Diff` = col_double(),
## `All-Round Ind` = col_integer()
## )
# Print a compact overview (column names, types, first values) of the raw
# all-rounder table.
rawAllRounderData %>% glimpse()
## Observations: 25
## Variables: 3
## $ player <chr> "AW Greig (1972-1977)", "IT Botham (1977...
## $ battingBowlingAvgDiff <dbl> 8.23, 5.14, 3.22, 0.52, -1.45, -8.28, 13...
## $ allRoundInd <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Innings cut-off: the cleaning steps below keep only players with strictly
# more than this many innings batted (batting table) or innings bowled in
# (bowling table).
inningsThres = 40
Cleaning data
# Tidy the batting table:
#   * drop the columns that are not analysed;
#   * strip the parenthesised career span from each player name;
#   * convert the text-typed count/average columns to numbers (values that
#     cannot be parsed become NA, with a coercion warning);
#   * keep players with more than `inningsThres` innings batted.
# highestInningsScoreNum is not converted here and stays character.
# NOTE(review): the `[^>]` in the pattern was presumably meant to be `[^)]`;
# it is kept unchanged so the cleaned names still match the other tables.
cleanedBattingData = rawBattingData %>%
  dplyr::select(-careerSpan, -highestInningsScore, -playerCount) %>%
  dplyr::mutate(
    player              = stringr::str_replace(player, " \\([^>]+\\)", ""),
    inningsBatted       = as.integer(inningsBatted),
    notOuts             = as.integer(notOuts),
    runsScored          = as.numeric(runsScored),
    battingAvg          = as.numeric(battingAvg),
    hundredsScored      = as.integer(hundredsScored),
    scoresOfFiftyOrMore = as.integer(scoresOfFiftyOrMore),
    ducksScored         = as.integer(ducksScored)
  ) %>%
  dplyr::filter(inningsBatted > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
# Print a compact overview of the cleaned batting table.
cleanedBattingData %>% glimpse()
## Observations: 620
## Variables: 15
## $ player <chr> "AN Cook", "GA Gooch", "AJ Stewart", "D...
## $ careerStart <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted <int> 282, 215, 235, 204, 181, 193, 212, 205,...
## $ notOuts <int> 16, 6, 21, 18, 8, 23, 7, 24, 15, 16, 6,...
## $ runsScored <dbl> 12145, 8900, 8463, 8231, 8181, 8114, 77...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg <dbl> 45.65, 42.58, 39.54, 44.25, 47.28, 47.7...
## $ hundredsScored <int> 32, 20, 15, 18, 23, 22, 16, 22, 22, 22,...
## $ scoresOfFiftyOrMore <int> 56, 46, 45, 39, 35, 42, 46, 46, 38, 24,...
## $ ducksScored <int> 8, 13, 14, 7, 10, 10, 20, 14, 9, 4, 15,...
## $ country <chr> "England", "England", "England", "Engla...
## $ x10000RunsScored <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...
# Tidy the bowling table:
#   * strip the parenthesised career span from each player name;
#   * convert the text-typed count/average columns to numbers (values that
#     cannot be parsed become NA, with a coercion warning);
#   * label players with fewer than 50 wickets "Not bowler", the rest "bowler";
#   * split the "wickets/runs" best-bowling figures into two columns each
#     (the resulting columns stay character);
#   * drop every row that contains an NA in any column, then keep players who
#     bowled in more than `inningsThres` innings.
# NOTE(review): the `[^>]` in the pattern was presumably meant to be `[^)]`;
# it is kept unchanged so the cleaned names still match the other tables.
cleanedBowlingData = rawBowlingData %>%
  dplyr::mutate(
    player = str_replace(player, " \\([^>]+\\)", ""),
    inningsBowledIn = as.integer(inningsBowledIn),
    ballsBowled = as.integer(ballsBowled),
    runsConceded = as.integer(runsConceded),
    wicketsTaken = as.integer(wicketsTaken),
    bowlingAvg = as.numeric(bowlingAvg),
    economyRate = as.numeric(economyRate),
    bowlingStrikeRate = as.numeric(bowlingStrikeRate),
    fiveWicketsInAnInnings = as.integer(fiveWicketsInAnInnings),
    tenWicketsInAMatch = as.integer(tenWicketsInAMatch),
    isBowler = ifelse(wicketsTaken < 50, "Not bowler", "bowler")
  ) %>%
  # fill = "right": entries without a "/" are expected here, so pad the missing
  # piece with NA explicitly instead of triggering the "Expected 2 pieces"
  # warning. The values produced are the same; na.omit() below removes them.
  tidyr::separate(
    bestBowlingInAnInnings,
    into = c("mostWicketsInnings", "mostWicketsInningsRuns"),
    sep = "/",
    fill = "right"
  ) %>%
  tidyr::separate(
    bestBowlingInAMatch,
    into = c("mostWicketsMatch", "mostWicketsMatchRuns"),
    sep = "/",
    fill = "right"
  ) %>%
  na.omit() %>%
  dplyr::filter(inningsBowledIn > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].
# Print a compact overview of the cleaned bowling table.
cleanedBowlingData %>% glimpse()
## Observations: 325
## Variables: 17
## $ player <chr> "JM Anderson", "SCJ Broad", "IT Botham"...
## $ inningsBowledIn <int> 257, 215, 168, 165, 127, 151, 109, 129,...
## $ ballsBowled <int> 30398, 24346, 21815, 17357, 15178, 2186...
## $ runsConceded <int> 14705, 12050, 10878, 8190, 6625, 7674, ...
## $ wicketsTaken <int> 540, 417, 383, 325, 307, 297, 255, 252,...
## $ mostWicketsInnings <chr> "7", "8", "8", "8", "8", "8", "6", "7",...
## $ mostWicketsInningsRuns <chr> "42", "15", "34", "43", "31", "51", "65...
## $ mostWicketsMatch <chr> "11", "11", "13", "9", "12", "13", "10"...
## $ mostWicketsMatchRuns <chr> "71", "121", "106", "92", "119", "71", ...
## $ bowlingAvg <dbl> 27.23, 28.89, 28.40, 25.20, 21.57, 25.8...
## $ economyRate <dbl> 2.90, 2.96, 2.99, 2.83, 2.61, 2.10, 2.9...
## $ bowlingStrikeRate <dbl> 56.2, 58.3, 56.9, 53.4, 49.4, 73.6, 60....
## $ fiveWicketsInAnInnings <int> 25, 16, 27, 16, 17, 17, 17, 9, 7, 15, 1...
## $ tenWicketsInAMatch <int> 3, 2, 4, 0, 3, 6, 3, 1, 1, 5, 1, 0, 1, ...
## $ x300WicketsTaken <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ isBowler <chr> "bowler", "bowler", "bowler", "bowler",...
# Strip the parenthesised career span from each all-rounder's name so it can be
# compared with the player names in the other cleaned tables.
# NOTE(review): the `[^>]` in the pattern was presumably meant to be `[^)]`;
# it is kept unchanged so the cleaned names still match the other tables.
cleanedAllRounderData = dplyr::mutate(
  rawAllRounderData,
  player = stringr::str_replace(player, " \\([^>]+\\)", "")
)
Numeric batting data
# Build the modelling table: the numeric batting columns plus the player name,
# with runs replaced by their base-10 logarithm. Career years are dropped,
# rows whose log is infinite (zero runs) are removed, and finally any row
# containing an NA is discarded.
numBattingData = cleanedBattingData %>%
  dplyr::select_if(is.numeric) %>%
  dplyr::mutate(
    player  = cleanedBattingData$player,  # re-attach names dropped by select_if
    logRuns = log10(runsScored)
  ) %>%
  dplyr::select(-runsScored, -careerStart, -careerEnd) %>%
  dplyr::filter(!is.infinite(logRuns)) %>%
  na.omit()
# Report the number of rows and columns of the modelling table.
numBattingData %>% dim()
## [1] 620 11
# Numeric feature matrix for clustering/PCA: every column except the player
# name, centred and scaled column-wise by scale().
numBattingMatrix = scale(
  as.matrix(
    as.data.frame(
      dplyr::select(numBattingData, -player)
    )
  )
)
K-means clustering
# kmeans() starts from randomly chosen centres, so without a fixed seed the
# cluster labels (1 vs 2) and even the assignments can change between runs,
# which would alter every later table and plot that uses them. The seed value
# matches the one in the commented-out ClassifyR call further down.
set.seed(2018)
# Two-cluster k-means on the scaled batting features. nstart = 25 runs 25
# random initialisations and keeps the solution with the lowest total
# within-cluster sum of squares, making the result less start-dependent.
kmeansObj = kmeans(x = numBattingMatrix, centers = 2, nstart = 25)
kmeansObj
## K-means clustering with 2 clusters of sizes 492, 128
##
## Cluster means:
## matchesPlayed inningsBatted notOuts battingAvg hundredsScored
## 1 -0.3832445 -0.4030279 -0.1759804 -0.2665595 -0.3849074
## 2 1.4730960 1.5491385 0.6764246 1.0245879 1.4794876
## scoresOfFiftyOrMore ducksScored x10000RunsScored x50BattingAvg logRuns
## 1 -0.3850548 -0.1441265 -0.1462267 -0.2345727 -0.334925
## 2 1.4800545 0.5539863 0.5620590 0.9016389 1.287368
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 1 2
## [36] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [456] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [561] 2 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [596] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 1964.390 2040.033
## (between_SS / total_SS = 35.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
PCA
# Principal component analysis of the (already scaled) batting feature matrix.
pcaObj = stats::prcomp(numBattingMatrix)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
# Venn diagram of the overlap between players in the batting modelling table
# and players in the cleaned bowling table.
gplots::venn(
  data = list(
    battingPlayers = numBattingData$player,
    bowlingPlayers = cleanedBowlingData$player
  )
)

# Combine the first two principal-component scores, the player names and the
# k-means labels, then attach each player's cleaned bowling record (if any).
pcaDataFrame = tibble(
  pca1 = pcaObj$x[, 1],
  pca2 = pcaObj$x[, 2],
  player = numBattingData$player,
  kmeans = as.factor(kmeansObj$cluster)
) %>%
  dplyr::left_join(cleanedBowlingData, by = "player") %>%
  dplyr::mutate(
    # Players without a row in the cleaned bowling table are non-bowlers.
    # The levels are given explicitly because the default level order of a
    # character vector follows the locale's collation, and the 0/1 coding
    # derived later via as.integer(factor(...)) - 1L depends on that order.
    isBowler = factor(
      coalesce(isBowler, "Not bowler"),
      levels = c("bowler", "Not bowler")
    )
  )
# Later steps pair the rows of pcaDataFrame with the rows of numBattingMatrix
# position by position, so the join must not have added rows (which would
# happen if a player name occurred more than once in the bowling table).
stopifnot(nrow(pcaDataFrame) == nrow(numBattingMatrix))
# Cross-tabulate bowler status (rows) against k-means cluster (columns).
table(pcaDataFrame[["isBowler"]], pcaDataFrame[["kmeans"]])
##
## 1 2
## bowler 198 21
## Not bowler 294 107
# Scatter plot of the first two principal components, coloured by bowler
# status and shaped by k-means cluster. `label` is not drawn by geom_point();
# it is carried in the mapping alongside the other aesthetics.
p1 = ggplot(
  data = pcaDataFrame,
  mapping = aes(
    x = pca1,
    y = pca2,
    colour = isBowler,
    shape = kmeans,
    label = player
  )
) +
  geom_point()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = p1)
Supervised learning
# DMresults <- ClassifyR::runTests(numBattingMatrix,
# classes = factor(pcaDataFrame$isBowler),
# datasetName = "Batting",
# classificationName = "Different Means",
# permutations = 20, folds = 5,
# seed = 2018, verbose = 1)
# DMresults
library(SmokyScotch)
## Warning: replacing previous import 'magrittr::set_names' by
## 'purrr::set_names' when loading 'SmokyScotch'
## Warning: replacing previous import 'ggplot2::margin' by
## 'randomForest::margin' when loading 'SmokyScotch'
## Warning: replacing previous import 'dplyr::combine' by
## 'randomForest::combine' when loading 'SmokyScotch'
# Fit three classifiers that predict bowler status (pcaDataFrame$isBowler)
# from the scaled batting features.
# NOTE(review): svmCV_multi / logitCV_multi / rfCV_multi are not defined in
# this file (presumably from SmokyScotch). The argument names suggest nExp
# repetitions of nFolds-fold cross-validation run on `cores` workers --
# confirm against the package documentation.
# NOTE(review): the calls presumably draw random folds, so their order is left
# untouched; no seed is set here -- TODO confirm whether results should be
# reproducible.

# SVM: features passed as a matrix, class labels as a factor.
svmMultiResult = svmCV_multi(x = numBattingMatrix,
y = factor(pcaDataFrame$isBowler),
nFolds = 5, nExp = 100,
cores = 5)
# Logistic regression: features passed as a data frame, class labels as 0/1
# integers obtained from the factor's level order (first level -> 0).
logitMultiResult = logitCV_multi(
x = data.frame(numBattingMatrix),
y = as.integer(factor(pcaDataFrame$isBowler)) -1L,
nFolds = 5, nExp = 100,
cores = 5)
# Random forest: features passed as a data frame, class labels as a factor.
rfMultiResult = rfCV_multi(
x = data.frame(numBattingMatrix),
y = factor(pcaDataFrame$isBowler),
nFolds = 5, nExp = 100,
cores = 5)
# Pull the mean-error entry out of every element of each classifier's result
# list, giving one numeric vector per classifier.
svmMultiError = purrr::map_dbl(svmMultiResult, "svmMeanError")
logitMultiError = purrr::map_dbl(logitMultiResult, "logitMeanError")
rfMultiError = purrr::map_dbl(rfMultiResult, "rfMeanError")

# Side-by-side boxplots of the three error distributions; the data frame's
# column names (taken from the variable names) label the boxes.
boxplot(data.frame(svmMultiError, logitMultiError, rfMultiError))

# For each classifier, extract the prediction vector from every element of its
# result list, convert it to character, and stack the vectors row-wise
# (one row per list element; presumably one per repetition).
predictMatrix1 = do.call(
  rbind,
  purrr::map(
    purrr::map(svmMultiResult, "svmPredictOrderedVector"),
    as.character
  )
)
predictMatrix2 = do.call(
  rbind,
  purrr::map(
    purrr::map(logitMultiResult, "logitPredictIntOrderedVector"),
    as.character
  )
)
predictMatrix3 = do.call(
  rbind,
  purrr::map(
    purrr::map(rfMultiResult, "rfPredictOrderedVector"),
    as.character
  )
)

# Score each classifier's predictions against the true labels and stack the
# three score vectors into one matrix (rows: SVM, Logit, RF). The logit
# predictions are compared with the same 0/1 integer coding used for fitting.
# NOTE(review): binaryClassScores / compareBinaryClassResults are not defined
# in this file (presumably from SmokyScotch) -- confirm their semantics there.
classifierMatrix = rbind(
  binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix1
  ),
  binaryClassScores(
    y = as.integer(as.factor(pcaDataFrame$isBowler)) - 1L,
    predictMatrix = predictMatrix2
  ),
  binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix3
  )
)
rownames(classifierMatrix) = c("SVM", "Logit", "RF")

compareBinaryClassResults(
  y = factor(pcaDataFrame$isBowler),
  classifierMatrix
)

# Append the SVM score of every player to the PCA table and flag the players
# who appear in the cleaned all-rounder table.
svmData = cbind(
  pcaDataFrame,
  svmScore = binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix1
  )
) %>%
  mutate(isAllRounder = player %in% cleanedAllRounderData$player)

# Mean SVM score for all-rounders versus everyone else.
svmData %>%
  group_by(isAllRounder) %>%
  summarise(meanSvmScore = mean(svmScore))
## # A tibble: 2 x 2
## isAllRounder meanSvmScore
## <lgl> <dbl>
## 1 FALSE 0.870
## 2 TRUE 0.131
# Boxplot of the SVM score distribution, split by all-rounder status.
ggplot(
  data = svmData,
  mapping = aes(x = isAllRounder, y = svmScore)
) +
  geom_boxplot()
